library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the raw bestseller export, keeping text columns as character
nyt_bestseller <- read.csv("NYT_best_seller .csv", stringsAsFactors = FALSE)

# Keep only rows whose 'list_name' contains "Fiction" (case-insensitive)
fiction_df <- subset(nyt_bestseller, grepl("Fiction", list_name, ignore.case = TRUE))

# Parse both date columns using the m/d/yy format
fiction_df$bestsellers_date <- as.Date(fiction_df$bestsellers_date, format = "%m/%d/%y")
fiction_df$published_date <- as.Date(fiction_df$published_date, format = "%m/%d/%y")

# Keep books published between 2010 and 2016 (inclusive).
# Compare Date values directly instead of comparing a character year with a
# number; rows whose date failed to parse (NA) are dropped by subset().
fiction_df <- subset(
  fiction_df,
  published_date >= as.Date("2010-01-01") & published_date <= as.Date("2016-12-31")
)

# Difference in days: bestsellers_date minus published_date.
# NOTE(review): the same calculation on the full dataset prints -14/-15 further
# down this document, so bestsellers_date appears to precede published_date --
# confirm the intended sign before interpreting the y-axis.
fiction_df$days_to_best_seller <- as.numeric(
  difftime(fiction_df$bestsellers_date, fiction_df$published_date, units = "days")
)

# Scatter plot with a LOESS trend line
ggplot(fiction_df, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) + # Colour points by day difference
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") + # Gradient from blue (low) to red (high)
  scale_x_date(labels = date_format("%Y"), breaks = "1 year") + # One x-axis label per year
  labs(
    title = "Days to Bestseller vs. Publication Date (2010-2016)",
    x = "Publication Date",
    y = "Days to Become Bestseller",
    color = "Days to Bestseller"
  ) +
  theme_minimal(base_size = 14) + # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
This data set has only three categories: Trade Fiction, Mass Market, and Hardcover Fiction. This first graph looks at Trade Fiction and Hardcover Fiction best sellers published from 2010 to 2016 and plots the number of days until each book reached the bestseller list. The gap between the publication date and the bestseller date is 14 to 15 days.
# Load necessary libraries
library(ggplot2)
library(scales) # For formatting dates

# Parse both date columns of the full dataset using the m/d/yy format
nyt_bestseller$bestsellers_date <- as.Date(nyt_bestseller$bestsellers_date, format = "%m/%d/%y")
nyt_bestseller$published_date <- as.Date(nyt_bestseller$published_date, format = "%m/%d/%y")

# Keep books published between 2010 and 2016 (inclusive).
# Compare Date values directly instead of comparing a character year with a
# number; rows whose date failed to parse (NA) are dropped by subset().
nyt_bestseller <- subset(
  nyt_bestseller,
  published_date >= as.Date("2010-01-01") & published_date <= as.Date("2016-12-31")
)

# Difference in days: bestsellers_date minus published_date
nyt_bestseller$days_to_best_seller <- as.numeric(
  difftime(nyt_bestseller$bestsellers_date, nyt_bestseller$published_date, units = "days")
)

# Scatter plot with a LOESS trend line
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) + # Colour points by day difference
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") + # Gradient from blue (low) to red (high)
  scale_x_date(labels = date_format("%Y"), breaks = "1 year") + # One x-axis label per year
  labs(
    title = "Days to Bestseller vs. Publication Date (2010-2016)",
    x = "Publication Date",
    y = "Days to Become Bestseller",
    color = "Days to Bestseller"
  ) +
  theme_minimal(base_size = 14) + # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## `geom_smooth()` using formula = 'y ~ x'
This graph looks at Hardcover Fiction, Trade Fiction, and Mass Market.
There is a similar range of days between the publication date and reaching
the bestseller list.
library(ggplot2)
library(scales)

# Same scatter plot as before, but with an x-axis tick every three months
# labelled as abbreviated month plus year
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +
  scale_x_date(labels = date_format("%b %Y"), breaks = date_breaks("3 months")) + # show abbreviated months and years
  labs(
    title = "Days to Bestseller vs. Publication Date (2010-2016)",
    x = "Publication Date",
    y = "Days to Become Bestseller",
    color = "Days to Bestseller"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'
Added months to the x-axis for better readability.
library(dplyr)
# Dump every computed day difference to the console
print(nyt_bestseller[["days_to_best_seller"]])
## [1] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [19] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [37] -14 -14 -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [55] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [73] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [91] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [109] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [127] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [145] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [163] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [181] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [199] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [217] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [235] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14
## [253] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [271] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [289] -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [307] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [325] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [343] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [361] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [379] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [397] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [415] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [433] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [451] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [469] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [487] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14 -14 -14
## [505] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [523] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [541] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [559] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [577] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [595] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [613] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [631] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [649] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [667] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [685] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [703] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [721] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [739] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [757] -15 -15 -15 -15
# Filter for 2010-2016
# Packages used by this chunk
library(dplyr)
library(readr)
# Read the bestseller CSV (column-type message suppressed)
df <- read_csv("NYT_best_seller_08_16.csv", show_col_types = FALSE)
# Parse published_date using the m/d/yy format
df <- df %>%
  mutate(published_date = as.Date(published_date, format = "%m/%d/%y"))
# Earliest and latest publication dates before any filtering (NAs ignored)
df %>%
  pull(published_date) %>%
  range(na.rm = TRUE) %>%
  print()
## [1] "2008-06-08" "2016-06-12"
# Restrict to publication dates from 2010-01-01 through 2016-12-31 (inclusive)
df_filtered <- filter(
  df,
  published_date >= as.Date("2010-01-01"),
  published_date <= as.Date("2016-12-31")
)
# Tally rows per publisher, largest count first
publisher_counts_2010_2016 <- arrange(
  count(df_filtered, publisher, name = "n"),
  desc(n)
)
# Sanity check: report the row and column counts of the publisher tally
print(c(nrow(publisher_counts_2010_2016), ncol(publisher_counts_2010_2016)))
## [1] 115 2
# Preview the first six publishers of the tally
print(head(publisher_counts_2010_2016, n = 6L))
## # A tibble: 6 × 2
## publisher n
## <chr> <int>
## 1 Grand Central 63
## 2 Bantam 54
## 3 Berkley 40
## 4 Vintage 28
## 5 Ballantine 27
## 6 Putnam 26
# Show the publisher tally (tibble printing truncates long output)
publisher_counts_2010_2016 %>%
  print()
## # A tibble: 115 × 2
## publisher n
## <chr> <int>
## 1 Grand Central 63
## 2 Bantam 54
## 3 Berkley 40
## 4 Vintage 28
## 5 Ballantine 27
## 6 Putnam 26
## 7 Dell 25
## 8 Little, Brown 24
## 9 Simon & Schuster 20
## 10 Knopf 19
## # ℹ 105 more rows
# Horizontal bar chart of every publisher, ordered by number of best sellers
ggplot(publisher_counts_2010_2016, aes(x = reorder(publisher, n), y = n)) +
  geom_col(fill = "skyblue") + # bar heights come straight from the n column
  labs(
    title = "Publisher Counts in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 14),               # x-axis tick labels
    axis.text.y = element_text(size = 14),               # y-axis tick labels (publisher names)
    axis.title.x = element_text(size = 16),              # x-axis title
    axis.title.y = element_text(size = 16),              # y-axis title
    plot.title = element_text(size = 18, face = "bold")  # main title
  ) +
  coord_flip() # publishers on the vertical axis
# Ten publishers with the highest counts: sort descending, take the first ten rows
top_publishers <- slice_head(
  arrange(publisher_counts_2010_2016, desc(n)),
  n = 10
)
# Horizontal bar chart of the top ten
ggplot(top_publishers, aes(x = reorder(publisher, n), y = n)) +
  geom_col(fill = "red") + # bar heights come straight from the n column
  labs(
    title = "Top 10 Publishers in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal() +
  coord_flip() # publishers on the vertical axis
Grand Central; Bantam; Berkley; Vintage; Ballantine; Putnam; Dell; Little, Brown; Simon & Schuster; and Knopf have the most NYT best sellers from 2010-2016.
# Packages used by this chunk
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller file, keeping text columns as character
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse published_date using the m/d/yy format
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name as an ordered factor running January..December
data[["month"]] <- factor(
  format(data[["published_date"]], "%B"),
  levels = month.name,
  ordered = TRUE
)
# Number of rows per month
monthly_counts <- summarise(group_by(data, month), count = n())
# Bar chart, one bar per month, legend hidden
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Frequency of Bestseller Books Released Per Month",
    x = "Month",
    y = "Number of Bestsellers"
  )
# Read the cleaned file into its own object with read.csv defaults
updated_categories <- read.csv("Updated_Bestsellers_Data_Cleaned.csv")
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the same file again, keeping text columns as character
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format) and derive a January..December month factor
data <- data %>%
  mutate(
    published_date = as.Date(published_date, format = "%m/%d/%y"),
    month = factor(format(published_date, "%B"), levels = month.name)
  )
# Stacked bar chart: row count per month, split by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar(stat = "count") +
  labs(
    title = "Distribution of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller file, keeping text columns as character
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format), then build a January..December month factor
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Row count for every month/category combination present in the data
heatmap_data <- count(data, month, New_Category)
# Heatmap: darker tiles mean more rows
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap of Bestsellers by Month and Category",
    x = "Month",
    y = "Category",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller file, keeping text columns as character
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format) and derive a January..December month factor
data <- data %>%
  mutate(
    published_date = as.Date(published_date, format = "%m/%d/%y"),
    month = factor(format(published_date, "%B"), levels = month.name)
  )
# Row count for every month/category combination present in the data
area_chart_data <- count(data, month, New_Category)
# Stacked area chart with one band per category
ggplot(
  area_chart_data,
  aes(x = month, y = n, fill = New_Category, group = New_Category)
) +
  geom_area(position = "stack", alpha = 0.7) +
  labs(
    title = "Stacked Area Chart of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Per month
# Read the updated cleaned file, keeping text columns as character
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format) and derive the month name as an ordered
# factor running January..December
data <- data %>%
  mutate(
    published_date = as.Date(published_date, format = "%m/%d/%y"),
    month = factor(format(published_date, "%B"), levels = month.name, ordered = TRUE)
  )
# Number of rows per month
monthly_counts <- summarise(group_by(data, month), count = n())
# Bar chart, one bar per month, legend hidden
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Frequency of Bestseller Books Released Per Month",
    x = "Month",
    y = "Number of Bestsellers"
  )
Books released in February appear the least in the bestseller list, while October has the most.
# Read the updated cleaned file, keeping text columns as character
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format), then build a January..December month factor
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Row count for every month/category combination present in the data
area_chart_data <- count(data, month, New_Category)
# Stacked area chart with one band per category
ggplot(
  area_chart_data,
  aes(x = month, y = n, fill = New_Category, group = New_Category)
) +
  geom_area(position = "stack", alpha = 0.7) +
  labs(
    title = "Stacked Area Chart of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Fiction, Manga/Graphic Novels, and Nonfiction categories have the most books on the bestseller list year round compared to the three other categories.
# Read the updated cleaned file, keeping text columns as character
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format) and derive a January..December month factor
data <- data %>%
  mutate(
    published_date = as.Date(published_date, format = "%m/%d/%y"),
    month = factor(format(published_date, "%B"), levels = month.name)
  )
# Row count for every month/category combination present in the data
heatmap_data <- count(data, month, New_Category)
# Heatmap: darker tiles mean more rows
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap of Bestsellers by Month and Category",
    x = "Month",
    y = "Category",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
There is little seasonality for books categorized as Self-Improvement and Other. Surprisingly, books categorized as Young Audiences peak on the bestseller list in October. Fiction books peak in March, April, and September and are the most frequent category on the bestseller list year round.
# Read the updated cleaned file, keeping text columns as character
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse the date (m/d/yy format), then build a January..December month factor
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Stacked bar chart: row count per month, split by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar(stat = "count") +
  labs(
    title = "Distribution of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
This chart shows the distribution of each category per month. Younger Audiences, Self-Improvement, and Other have the fewest bestsellers year round. There is limited seasonality in all categories.